library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(viridis)
## Loading required package: viridisLite
"%&%" = function(a,b) paste(a,b,sep="")
date <- Sys.Date()
my.dir <- "/home/wheelerlab3/mesa_analyses/"
#my.dir <- "~/mount/wheelerlab3/mesa_analyses/"
#read in BSLMM
for(pop in c('AFA','CAU','HIS')){
filelist <- scan(my.dir %&% "BSLMM_exp/" %&% pop %&% "filelist", "c")
for(file in filelist){
a <- fread(my.dir %&% "BSLMM_exp/" %&% file) %>%
dplyr::mutate(pop=pop)
if(exists('bslmm')){
bslmm <- rbind(bslmm, a)
}else{
bslmm <- a
}
}
}
#read in BVSR
for(pop in c('AFA','CAU','HIS')){
filelist <- scan(my.dir %&% "BVSR_exp/" %&% pop %&% "filelist", "c")
for(file in filelist){
a <- fread(my.dir %&% "BVSR_exp/" %&% file) %>%
dplyr::mutate(pop=pop)
if(exists('bvsr')){
bvsr <- rbind(bvsr, a)
}else{
bvsr <- a
}
}
}
#join by ensg
all <- left_join(bslmm,bvsr,by=c('gene','pop'))
summary(lm(hh50~pve50,all))
##
## Call:
## lm(formula = hh50 ~ pve50, data = all)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.248485 -0.006475 0.001618 0.008398 0.243683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0115639 0.0001198 -96.55 <2e-16 ***
## pve50 0.9918154 0.0005807 1707.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01563 on 29389 degrees of freedom
## Multiple R-squared: 0.99, Adjusted R-squared: 0.99
## F-statistic: 2.917e+06 on 1 and 29389 DF, p-value: < 2.2e-16
summary(lm(hh50~pve50,all))$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.01156387 0.0001197765 -96.54541 0
## pve50 0.99181541 0.0005807203 1707.90564 0
cor.test(all$hh50,all$pve50)
##
## Pearson's product-moment correlation
##
## data: all$hh50 and all$pve50
## t = 1707.9, df = 29389, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9948848 0.9951129
## sample estimates:
## cor
## 0.9950001
cor.test(all$hh50,all$pve50,method='s')
## Warning in cor.test.default(all$hh50, all$pve50, method = "s"): Cannot
## compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: all$hh50 and all$pve50
## S = 1.0677e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9747682
#make plot like this for elastic net alpha comparison? LASSO vs. EN (colored by alpha=0.5 or 0.05)
ggplot(all,aes(x=pve50,y=hh50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) +
xlab('BSLMM PVE') + ylab('BVSR PVE')

ggplot(all,aes(x=pve50,y=hh50,color=pge50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0)

ggplot(all,aes(x=pve50,y=hh50,color=snp50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0)

ggplot(all,aes(x=pve50,y=hh50,color=n_gamma50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0)

cor.test(all$snp50,all$n_gamma50)
##
## Pearson's product-moment correlation
##
## data: all$snp50 and all$n_gamma50
## t = 13.643, df = 29389, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.06796058 0.09068185
## sample estimates:
## cor
## 0.07933152
cor.test(all$snp50,all$n_gamma50,method='s')
## Warning in cor.test.default(all$snp50, all$n_gamma50, method = "s"): Cannot
## compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: all$snp50 and all$n_gamma50
## S = 3.1348e+12, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2591827
ggplot(all,aes(x=n_gamma50, y=snp50)) + geom_density_2d() + facet_wrap(~pop)

ggplot(all,aes(x=n_gamma50, y=snp50, color=hh50)) + geom_point() + facet_wrap(~pop)

data <- all %>% mutate(position=1:length(pve50),`medianSNPs<=10`=n_gamma50<=10,LCS=factor(pge025<=0.01,labels=c('> 0.01','<= 0.01')))
ggplot(data,aes(x=pve50,y=pge50,ymin=pge025,ymax=pge975,col=LCS)) + geom_pointrange(col='gray') + geom_point() + theme_bw(12) + xlab("PVE") + ylab("PGE") + theme(legend.position = c(1,0),legend.justification = c(1,0)) + facet_wrap(~pop)

ngenes <- dim(all)[1]/3
sorted <- dplyr::arrange(all,pop,pve50) %>% mutate(order=rep(c(1:ngenes),3))
ggplot(sorted,aes(x=order,y=pve50,ymin=pve025,ymax=pve975)) + geom_pointrange(col='gray') + geom_point() + facet_wrap(~pop)

sorted <- dplyr::arrange(all,pop,hh50) %>% mutate(order=rep(c(1:ngenes),3))
ggplot(sorted,aes(x=order,y=hh50,ymin=hh025,ymax=hh975)) + geom_pointrange(col='gray') + geom_point() + facet_wrap(~pop)

BSLMM Sup Fig
pve <- dplyr::select(all, pop, gene, pve50, hh50)
h2_afa <- read.table(my.dir %&% "GCTA_exp/AFA_MESA_Nk-20.local-h2.2017-07-11.txt",header=T) %>% mutate(h2=local.h2,pop='AFA') %>% dplyr::select(ensid,h2,pop)
h2_cau <- read.table(my.dir %&% "GCTA_exp/CAU_MESA_Nk-20.local-h2.2017-07-11.txt",header=T) %>% mutate(h2=local.h2,pop='CAU') %>% dplyr::select(ensid,h2,pop)
h2_his <- read.table(my.dir %&% "GCTA_exp/HIS_MESA_Nk-20.local-h2.2017-07-11.txt",header=T) %>% mutate(h2=local.h2,pop='HIS') %>% dplyr::select(ensid,h2,pop)
h2_res <- rbind(h2_afa,h2_cau,h2_his)
afa <- fread("/home/lauren/files_for_revisions_plosgen/fst_results/fst_table_AFA.txt")
mean_afa <- afa[,list(R2_AFA=mean(R2_AFA, na.rm = TRUE), R2_CAU=mean(R2_CAU, na.rm = TRUE),
R2_HIS=mean(R2_HIS, na.rm = TRUE)),
by=GENE]
r2_afa <- dplyr::select(mean_afa,GENE,R2_AFA) %>% mutate(R2=R2_AFA,pop="AFA") %>% dplyr::select(GENE,R2,pop)
r2_cau <- dplyr::select(mean_afa,GENE,R2_CAU) %>% mutate(R2=R2_CAU,pop="CAU") %>% dplyr::select(GENE,R2,pop)
r2_his <- dplyr::select(mean_afa,GENE,R2_HIS) %>% mutate(R2=R2_HIS,pop="HIS") %>% dplyr::select(GENE,R2,pop)
r2_res <- rbind(r2_afa,r2_cau,r2_his)
mega <- left_join(pve, h2_res, by=c('gene'='ensid','pop'))
## Warning: Column `gene`/`ensid` joining character vector and factor,
## coercing into character vector
mega <- left_join(mega,r2_res,by=c('gene'='GENE','pop'))
#rm NAs for plotting
mega <- mega[complete.cases(mega),]
###need to make h2 and R2 long to facet
b <- ggplot(mega,aes(x=pve50,y=h2,col=R2)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x='BSLMM PVE',y=expression(paste('GCTA ', h^2)),col=expression(R^2),title='B') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + scale_color_viridis()
#make plot like this for elastic net alpha comparison? LASSO vs. EN (colored by alpha=0.5 or 0.05)
c <- ggplot(mega,aes(x=pve50,y=hh50,col=R2)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x='BSLMM PVE',y='BVSR PVE',col=expression(R^2),title='C') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) +
scale_color_viridis()
a <- ggplot(data,aes(x=pve50,y=pge50,ymin=pge025,ymax=pge975,col=LCS)) + geom_pointrange(col='gray') + geom_point() + theme_bw(14) + xlab("BLSMM PVE") + ylab("BSLMM PGE") + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + facet_wrap(~pop) + ggtitle('A') +
scale_color_viridis(discrete=TRUE)
cor(mega[,-1:-2])
## pve50 hh50 h2 R2
## pve50 1.0000000 0.9950785 0.9660910 0.9133785
## hh50 0.9950785 1.0000000 0.9626454 0.9131412
## h2 0.9660910 0.9626454 1.0000000 0.8926461
## R2 0.9133785 0.9131412 0.8926461 1.0000000
#NEXT: add correlations to A,B
grid.arrange(a, b, c, nrow=3)

plot BSLMM and ENET
#read in alpha=1
for(pop in c('AFA','CAU','HIS')){
a <- fread('/home/lauren/files_for_revisions_plosgen/en_v7/new_output_alphas/' %&% pop %&%
'_nested_cv_all_model_summaries_10_peer_3pcs_a1.txt') %>%
dplyr::mutate(pop=pop,test_R2_avg_1=ifelse(test_R2_avg<0,0,test_R2_avg),cv_R2_avg_1=ifelse(cv_R2_avg<0,0,cv_R2_avg), in_sample_R2_1=in_sample_R2) %>% dplyr::select(gene_id,pop,alpha,test_R2_avg_1,cv_R2_avg_1,in_sample_R2_1)
if(exists('alpha1')){
alpha1 <- rbind(alpha1, a)
}else{
alpha1 <- a
}
}
#read in alpha=0.05
for(pop in c('AFA','CAU','HIS')){
a <- fread('/home/lauren/files_for_revisions_plosgen/en_v7/new_output_alphas/' %&% pop %&%
'_nested_cv_all_model_summaries_10_peer_3pcs_a0.05.txt') %>%
dplyr::mutate(pop=pop,test_R2_avg_05=ifelse(test_R2_avg<0,0,test_R2_avg),cv_R2_avg_05=ifelse(cv_R2_avg<0,0,cv_R2_avg), in_sample_R2_05=in_sample_R2) %>% dplyr::select(gene_id,pop,alpha,test_R2_avg_05, cv_R2_avg_05,in_sample_R2_05)
if(exists('alpha05')){
alpha05 <- rbind(alpha05, a)
}else{
alpha05 <- a
}
}
#read in alpha=0.5
for(pop in c('AFA','CAU','HIS')){
a <- fread('/home/lauren/files_for_revisions_plosgen/en_v7/new_output/' %&% pop %&%
'_nested_cv_all_model_summaries_10_peer_3pcs.txt') %>%
dplyr::mutate(pop=pop,test_R2_avg_5=ifelse(test_R2_avg<0,0,test_R2_avg),cv_R2_avg_5=ifelse(cv_R2_avg<0,0,cv_R2_avg), in_sample_R2_5=in_sample_R2) %>% dplyr::select(gene_id,pop,alpha,test_R2_avg_5,cv_R2_avg_5,in_sample_R2_5)
if(exists('alpha5')){
alpha5 <- rbind(alpha5, a)
}else{
alpha5 <- a
}
}
enet <- left_join(alpha05,alpha5,by=c("gene_id","pop"))
enet <- left_join(enet,alpha1,by=c("gene_id","pop"))
giga <- left_join(enet,mega,by=c("gene_id"="gene","pop"))
#rm(alpha05,alpha5,alpha1)
ggplot(giga, aes(x=test_R2_avg_5,y=test_R2_avg_05)) + geom_point() + facet_wrap(~pop) +
geom_abline(slope=1,intercept=0,col='blue') + theme_bw()
## Warning: Removed 1 rows containing missing values (geom_point).

table(giga$test_R2_avg_5 >= giga$test_R2_avg_05,useNA="i")
##
## FALSE TRUE <NA>
## 6674 22410 1
ggplot(giga, aes(x=test_R2_avg_5,y=test_R2_avg_1)) + geom_point() + facet_wrap(~pop) +
geom_abline(slope=1,intercept=0,col='blue') + theme_bw()
## Warning: Removed 1 rows containing missing values (geom_point).

table(giga$test_R2_avg_05 >= giga$test_R2_avg_1,useNA="i")
##
## FALSE TRUE <NA>
## 7858 21226 1
ggplot(giga, aes(x=test_R2_avg_5,y=test_R2_avg_1)) + geom_point() + facet_wrap(~pop) +
geom_abline(slope=1,intercept=0,col='blue')
## Warning: Removed 1 rows containing missing values (geom_point).

table(giga$test_R2_avg_5 >= giga$test_R2_avg_1,useNA="i")
##
## FALSE TRUE <NA>
## 7218 21866 1
ggplot(giga, aes(x=test_R2_avg_1,y=test_R2_avg_1-test_R2_avg_05)) + geom_point() + facet_wrap(~pop) +
geom_hline(yintercept = 0,col='blue')
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(giga, aes(x=test_R2_avg_1,y=test_R2_avg_1-test_R2_avg_5)) + geom_point() + facet_wrap(~pop) +
geom_hline(yintercept = 0,col='blue')
## Warning: Removed 1 rows containing missing values (geom_point).

#bland-altman
ba1 <- left_join(alpha1,alpha5,by=c('gene_id','pop')) %>% mutate(diffR2=test_R2_avg_1-test_R2_avg_5) %>%
select(gene_id,pop,alpha.y,test_R2_avg_1,diffR2)
ba2 <- left_join(alpha1,alpha05,by=c('gene_id','pop')) %>% mutate(diffR2=test_R2_avg_1-test_R2_avg_05) %>%
select(gene_id,pop,alpha.y,test_R2_avg_1,diffR2)
ba <- rbind(ba1,ba2) %>% mutate(alpha = factor(alpha.y,levels=c('0.05','0.5')))
ggplot(ba, aes(x=test_R2_avg_1,y=diffR2,col=alpha)) + geom_point() + scale_color_viridis(discrete=TRUE) +
facet_wrap(~pop) + theme_bw(14)
## Warning: Removed 2 rows containing missing values (geom_point).

BSLMM plot with test_avg_R2
#c
ggplot(giga,aes(x=pve50,y=hh50,col=test_R2_avg_5)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x='BSLMM PVE',y='BVSR PVE',col=expression(R^2),title='C') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) +
scale_color_viridis()
## Warning: Removed 2805 rows containing missing values (geom_point).

#a
ggplot(data,aes(x=pve50,y=pge50,ymin=pge025,ymax=pge975,col=LCS)) + geom_pointrange(col='gray') + geom_point() + theme_bw(14) + xlab("BLSMM PVE") + ylab("BSLMM PGE") + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + facet_wrap(~pop) + ggtitle('A') +
scale_color_viridis(discrete=TRUE)

#b
ggplot(giga,aes(x=pve50,y=h2,col=test_R2_avg_5)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x='BSLMM PVE',y=expression(paste('GCTA ', h^2)),col=expression(R^2),title='B') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + scale_color_viridis()
## Warning: Removed 2805 rows containing missing values (geom_point).

ggplot(giga,aes(x=test_R2_avg_5,y=test_R2_avg_05,col=pve50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x=expression(paste(R^2, " (",alpha, ' = ',0.5,')')),y=expression(paste(R^2, " (",alpha, ' = ',0.05,')')),col='BSLMM\nPVE',title='B') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + scale_color_viridis()
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(giga,aes(x=test_R2_avg_5,y=test_R2_avg_05,col=h2)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x=expression(paste(R^2, " (",alpha, ' = ',0.5,')')),y=expression(paste(R^2, " (",alpha, ' = ',0.05,')')),col='GCTA\nh2',title='B') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + scale_color_viridis()
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(giga,aes(x=test_R2_avg_5,y=test_R2_avg_1,col=pve50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x=expression(paste(R^2, " (",alpha, ' = ',0.5,')')),y=expression(paste(R^2, " (",alpha, ' = ',1,')')),col='BSLMM\nPVE',title='B') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + scale_color_viridis()
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(giga,aes(x=test_R2_avg_1,y=test_R2_avg_05,col=pve50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x=expression(paste(R^2, " (",alpha, ' = ',1,')')),y=expression(paste(R^2, " (",alpha, ' = ',0.05,')')),col='BSLMM\nPVE',title='B') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + scale_color_viridis()
## Warning: Removed 1 rows containing missing values (geom_point).

keygiga <- dplyr::select(giga,test_R2_avg_05,test_R2_avg_5,test_R2_avg_1,pve50,hh50,h2)
cor(keygiga, use='pairwise')
## test_R2_avg_05 test_R2_avg_5 test_R2_avg_1 pve50
## test_R2_avg_05 1.0000000 0.9824312 0.9773138 0.9206417
## test_R2_avg_5 0.9824312 1.0000000 0.9848480 0.9271681
## test_R2_avg_1 0.9773138 0.9848480 1.0000000 0.9236737
## pve50 0.9206417 0.9271681 0.9236737 1.0000000
## hh50 0.9267777 0.9324281 0.9287428 0.9950785
## h2 0.8929124 0.8960052 0.8922976 0.9660910
## hh50 h2
## test_R2_avg_05 0.9267777 0.8929124
## test_R2_avg_5 0.9324281 0.8960052
## test_R2_avg_1 0.9287428 0.8922976
## pve50 0.9950785 0.9660910
## hh50 1.0000000 0.9626454
## h2 0.9626454 1.0000000
ggplot(giga,aes(x=test_R2_avg_5,y=hh50)) + geom_smooth() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x=expression(paste(R^2, " (",alpha, ' = ',0.5,')')),y='BVSR PVE')
## `geom_smooth()` using method = 'gam'
## Warning: Removed 2805 rows containing non-finite values (stat_smooth).

ggplot(giga,aes(x=test_R2_avg_5,y=pve50)) + geom_smooth() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x=expression(paste(R^2, " (",alpha, ' = ',0.5,')')),y='BLSMM PVE')
## `geom_smooth()` using method = 'gam'
## Warning: Removed 2805 rows containing non-finite values (stat_smooth).

ggplot(giga,aes(x=test_R2_avg_5,y=h2)) + geom_smooth() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x=expression(paste(R^2, " (",alpha, ' = ',0.5,')')),y='GCTA h2')
## `geom_smooth()` using method = 'gam'
## Warning: Removed 2805 rows containing non-finite values (stat_smooth).

abvsr <- left_join(alpha5,mega,by=c('gene_id'='gene','pop')) %>% mutate(model='BVSR',PVE=hh50)
abslmm <- left_join(alpha5,mega,by=c('gene_id'='gene','pop')) %>% mutate(model='BSLMM',PVE=pve50)
agcta <- left_join(alpha5,mega,by=c('gene_id'='gene','pop')) %>% mutate(model='LMM',PVE=h2)
aplot <- rbind(abvsr, abslmm, agcta)
ggplot(aplot, aes(x=test_R2_avg_5,y=PVE,col=model)) + geom_smooth() +
facet_wrap(~pop) + theme_bw() + scale_color_viridis(discrete=TRUE)
## `geom_smooth()` using method = 'gam'
## Warning: Removed 8415 rows containing non-finite values (stat_smooth).
